import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from pandasql import sqldf
import plotly.express as px
import pandas as pd
# Load the raw bank-transactions export and preview the first rows.
csv_path = "Documents//Python//bank_transactions.csv"
df = pd.read_csv(csv_path)
df.head()
| TransactionID | CustomerID | CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount (INR) | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | T1 | C5841053 | 10/1/94 | F | JAMSHEDPUR | 17819.05 | 2/8/16 | 143207 | 25.0 |
| 1 | T2 | C2142763 | 4/4/57 | M | JHAJJAR | 2270.69 | 2/8/16 | 141858 | 27999.0 |
| 2 | T3 | C4417068 | 26/11/96 | F | MUMBAI | 17874.44 | 2/8/16 | 142712 | 459.0 |
| 3 | T4 | C5342380 | 14/9/73 | F | MUMBAI | 866503.21 | 2/8/16 | 142714 | 2060.0 |
| 4 | T5 | C9031234 | 24/3/88 | F | NAVI MUMBAI | 6714.43 | 2/8/16 | 181156 | 1762.5 |
# Strip the "(INR)" suffix so the amount column is easier to reference in code.
df = df.rename(columns={'TransactionAmount (INR)': 'TransactionAmount'})
df.head()
| TransactionID | CustomerID | CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | T1 | C5841053 | 10/1/94 | F | JAMSHEDPUR | 17819.05 | 2/8/16 | 143207 | 25.0 |
| 1 | T2 | C2142763 | 4/4/57 | M | JHAJJAR | 2270.69 | 2/8/16 | 141858 | 27999.0 |
| 2 | T3 | C4417068 | 26/11/96 | F | MUMBAI | 17874.44 | 2/8/16 | 142712 | 459.0 |
| 3 | T4 | C5342380 | 14/9/73 | F | MUMBAI | 866503.21 | 2/8/16 | 142714 | 2060.0 |
| 4 | T5 | C9031234 | 24/3/88 | F | NAVI MUMBAI | 6714.43 | 2/8/16 | 181156 | 1762.5 |
df.describe()
| CustAccountBalance | TransactionTime | TransactionAmount | |
|---|---|---|---|
| count | 1.046198e+06 | 1.048567e+06 | 1.048567e+06 |
| mean | 1.154035e+05 | 1.570875e+05 | 1.574335e+03 |
| std | 8.464854e+05 | 5.126185e+04 | 6.574743e+03 |
| min | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 |
| 25% | 4.721760e+03 | 1.240300e+05 | 1.610000e+02 |
| 50% | 1.679218e+04 | 1.642260e+05 | 4.590300e+02 |
| 75% | 5.765736e+04 | 2.000100e+05 | 1.200000e+03 |
| max | 1.150355e+08 | 2.359590e+05 | 1.560035e+06 |
df.isnull().sum()
TransactionID 0 CustomerID 0 CustomerDOB 3397 CustGender 1100 CustLocation 151 CustAccountBalance 2369 TransactionDate 0 TransactionTime 0 TransactionAmount 0 dtype: int64
# Drop rows with any missing value: ~6.9k of ~1.05M rows, a negligible loss.
# .copy() detaches the result from `df`, which silences the spurious
# SettingWithCopyWarning that later in-place edits on `newdf` would raise.
newdf = df.dropna(axis=0).copy()
newdf.isnull().sum()
TransactionID 0 CustomerID 0 CustomerDOB 0 CustGender 0 CustLocation 0 CustAccountBalance 0 TransactionDate 0 TransactionTime 0 TransactionAmount 0 dtype: int64
print(newdf.shape,df.shape)
(1041614, 9) (1048567, 9)
print(len(newdf)-len(df))
-6953
newdf.drop(['TransactionID','CustomerID'],axis = 1, inplace = True) ## Removing TransactionID and CustomerID) #as I feel there are no that useful
/opt/anaconda3/lib/python3.9/site-packages/pandas/core/frame.py:4906: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy return super().drop(
newdf.head()
| CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | |
|---|---|---|---|---|---|---|---|
| 0 | 10/1/94 | F | JAMSHEDPUR | 17819.05 | 2/8/16 | 143207 | 25.0 |
| 1 | 4/4/57 | M | JHAJJAR | 2270.69 | 2/8/16 | 141858 | 27999.0 |
| 2 | 26/11/96 | F | MUMBAI | 17874.44 | 2/8/16 | 142712 | 459.0 |
| 3 | 14/9/73 | F | MUMBAI | 866503.21 | 2/8/16 | 142714 | 2060.0 |
| 4 | 24/3/88 | F | NAVI MUMBAI | 6714.43 | 2/8/16 | 181156 | 1762.5 |
newdf.corr() # there is no significance relation between variables
| CustAccountBalance | TransactionTime | TransactionAmount | |
|---|---|---|---|
| CustAccountBalance | 1.000000 | -0.004012 | 0.062692 |
| TransactionTime | -0.004012 | 1.000000 | 0.008296 |
| TransactionAmount | 0.062692 | 0.008296 | 1.000000 |
newdf.CustomerDOB.value_counts() # abnormally high number of people have birth year as 1800
1/1/1800 56292
1/1/89 809
1/1/90 784
6/8/91 698
1/1/91 665
...
2/12/51 1
20/3/52 1
26/9/47 1
4/10/41 1
24/10/44 1
Name: CustomerDOB, Length: 17233, dtype: int64
# Remove the '1/1/1800' placeholder DOBs — almost certainly a data-entry
# sentinel, not real birth dates.
newdf = newdf[newdf['CustomerDOB'] != '1/1/1800']
newdf.CustomerDOB.value_counts()
1/1/89 809
1/1/90 784
6/8/91 698
1/1/91 665
1/1/92 631
...
23/2/05 1
28/11/42 1
23/9/49 1
14/3/40 1
24/10/44 1
Name: CustomerDOB, Length: 17232, dtype: int64
# Parse day/month/two-digit-year strings. With %y the parser maps 00-68 to
# 20xx and 69-99 to 19xx, so e.g. '4/4/57' becomes 2057 — corrected below.
newdf['TransactionDate'] = pd.to_datetime(newdf['TransactionDate'], format = '%d/%m/%y')
newdf['CustomerDOB'] = pd.to_datetime(newdf['CustomerDOB'], format = '%d/%m/%y')
newdf.CustomerDOB # Year 2057 ??
0 1994-01-10
1 2057-04-04
2 1996-11-26
3 1973-09-14
4 1988-03-24
...
1048562 1990-04-08
1048563 1992-02-20
1048564 1989-05-18
1048565 1978-08-30
1048566 1984-03-05
Name: CustomerDOB, Length: 985322, dtype: datetime64[ns]
# Shift impossible future DOBs (artefacts of the %y century rule) back one
# century in place. NOTE(review): the 2021 cutoff leaves DOBs parsed as
# 2017-2020 untouched — confirm those are genuine, not artefacts too.
newdf.loc[newdf['CustomerDOB'].dt.year >= 2021, ['CustomerDOB']] -= pd.DateOffset(years = 100)
newdf.head() # Subtracted 100 years from the records that were wrongly converted!
| CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | |
|---|---|---|---|---|---|---|---|
| 0 | 1994-01-10 | F | JAMSHEDPUR | 17819.05 | 2016-08-02 | 143207 | 25.0 |
| 1 | 1957-04-04 | M | JHAJJAR | 2270.69 | 2016-08-02 | 141858 | 27999.0 |
| 2 | 1996-11-26 | F | MUMBAI | 17874.44 | 2016-08-02 | 142712 | 459.0 |
| 3 | 1973-09-14 | F | MUMBAI | 866503.21 | 2016-08-02 | 142714 | 2060.0 |
| 4 | 1988-03-24 | F | NAVI MUMBAI | 6714.43 | 2016-08-02 | 181156 | 1762.5 |
# Age in fractional years as of run time ('today'), so values drift between
# runs — NOTE(review): anchoring on TransactionDate would be reproducible.
# Dividing by np.timedelta64(1, 'Y') converts the timedelta to years.
newdf['Age'] = (pd.to_datetime('today') - newdf['CustomerDOB'])/np.timedelta64(1, 'Y')
newdf.shape
(985322, 8)
newdf.head()
| CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount | Age | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1994-01-10 | F | JAMSHEDPUR | 17819.05 | 2016-08-02 | 143207 | 25.0 | 27.941459 |
| 1 | 1957-04-04 | M | JHAJJAR | 2270.69 | 2016-08-02 | 141858 | 27999.0 | 64.711550 |
| 2 | 1996-11-26 | F | MUMBAI | 17874.44 | 2016-08-02 | 142712 | 459.0 | 25.063918 |
| 3 | 1973-09-14 | F | MUMBAI | 866503.21 | 2016-08-02 | 142714 | 2060.0 | 48.264942 |
| 4 | 1988-03-24 | F | NAVI MUMBAI | 6714.43 | 2016-08-02 | 181156 | 1762.5 | 33.740346 |
newdf['TransactionAmount'].describe() # As the data is huge we try to take values in transaction above median.
count 9.853220e+05 mean 1.452425e+03 std 6.139765e+03 min 0.000000e+00 25% 1.510000e+02 50% 4.310000e+02 75% 1.125000e+03 max 1.560035e+06 Name: TransactionAmount, dtype: float64
# Query via pandasql (SQLite backend): keep transactions above 500, one row
# per CustLocation, sorted by amount descending.
# NOTE(review): selecting non-aggregated columns with GROUP BY relies on
# SQLite's "bare column" behaviour — each location gets ONE arbitrary row,
# not a max/sum per location; verify this sampling is intended.
Loc_TA = sqldf('select Age,CustAccountBalance,CustGender,CustLocation,TransactionAmount from newdf where TransactionAmount > 500 group by CustLocation order by TransactionAmount desc')
Loc_TA
| Age | CustAccountBalance | CustGender | CustLocation | TransactionAmount | |
|---|---|---|---|---|---|
| 0 | 41.521477 | 798753.95 | M | GATE NO 4 MUMBAI | 455122.00 |
| 1 | 35.131203 | 90123.52 | M | JAIPUR DURGAPURA | 378415.46 |
| 2 | 45.401092 | 215315.01 | M | ROOMFORD | 117621.00 |
| 3 | 43.667997 | 43113.77 | M | MILTON KEYNES | 104478.00 |
| 4 | 38.189445 | 73542.33 | M | PHASE-01 HINJEWADI PUNE | 100000.00 |
| ... | ... | ... | ... | ... | ... |
| 6662 | 37.587105 | 8387.04 | M | NAGAR CHENNAI | 501.27 |
| 6663 | 30.936729 | 5075.39 | M | THALAYOLAPARAMBU | 501.00 |
| 6664 | 29.471949 | 76438.86 | F | PETHAPUR GANDHI NAGAR | 501.00 |
| 6665 | 69.697278 | 90260.19 | M | DAHISAR E MUMBAI | 501.00 |
| 6666 | 32.313896 | 494674.28 | F | BORIVALI E | 501.00 |
6667 rows × 5 columns
Loc_TA ['TransactionAmount'].median()
1298.0
Loc_TA.shape
(6667, 5)
# Scatter of age vs. transaction amount, coloured by gender.
# (Redundant re-import removed: plotly.express is already imported as px
# at the top of the file.)
fig = px.scatter(Loc_TA, x='Age', y='TransactionAmount', color='CustGender')
fig.update_layout(title_text="Age vs. Transaction amount", title_x=0.5)
fig.show()
# Re-plot with extreme amounts (>= 50,000) excluded so the bulk of the
# distribution is visible. (Redundant plotly.express re-import removed —
# it is already imported as px at the top of the file.)
Loc_TA_filter = Loc_TA[Loc_TA['TransactionAmount'] < 50000]
fig = px.scatter(Loc_TA_filter, x='Age', y='TransactionAmount', color='CustGender')
fig.update_layout(title_text="Age vs. Transaction amount", title_x=0.5)
fig.show()
# Encode the two categorical columns as integers for scaling/clustering.
# One encoder instance suffices: fit_transform refits it for each column.
# (The original instantiated LabelEncoder twice — duplicate line removed.)
labelencoder = LabelEncoder()
Loc_TA['CustGender'] = labelencoder.fit_transform(Loc_TA['CustGender'])
Loc_TA['CustLocation'] = labelencoder.fit_transform(Loc_TA['CustLocation'])
Loc_TA.head()
| Age | CustAccountBalance | CustGender | CustLocation | TransactionAmount | |
|---|---|---|---|---|---|
| 0 | 41.521477 | 798753.95 | 1 | 2060 | 455122.00 |
| 1 | 35.131203 | 90123.52 | 1 | 2542 | 378415.46 |
| 2 | 45.401092 | 215315.01 | 1 | 5140 | 117621.00 |
| 3 | 43.667997 | 43113.77 | 1 | 3700 | 104478.00 |
| 4 | 38.189445 | 73542.33 | 1 | 4673 | 100000.00 |
# Standardise every column to zero mean / unit variance before clustering,
# then rebuild a labelled DataFrame from the ndarray the scaler returns.
scaler = StandardScaler()
saved_cols = Loc_TA.columns
Loc_TA1 = pd.DataFrame(scaler.fit_transform(Loc_TA), columns=saved_cols)
Loc_TA1
| Age | CustAccountBalance | CustGender | CustLocation | TransactionAmount | |
|---|---|---|---|---|---|
| 0 | 0.483921 | 1.251709 | 0.591728 | -0.661437 | 48.690581 |
| 1 | -0.233442 | -0.020657 | 0.591728 | -0.410995 | 40.430206 |
| 2 | 0.919441 | 0.204128 | 0.591728 | 0.938898 | 12.345767 |
| 3 | 0.724886 | -0.105064 | 0.591728 | 0.190689 | 10.930423 |
| 4 | 0.109872 | -0.050429 | 0.591728 | 0.696250 | 10.448196 |
| ... | ... | ... | ... | ... | ... |
| 6662 | 0.042254 | -0.167417 | 0.591728 | 0.316949 | -0.266625 |
| 6663 | -0.704307 | -0.173363 | 0.591728 | 1.367039 | -0.266654 |
| 6664 | -0.868741 | -0.045228 | -1.689966 | 0.684299 | -0.266654 |
| 6665 | 3.646894 | -0.020412 | 0.591728 | -0.990337 | -0.266654 |
| 6666 | -0.549708 | 0.705725 | -1.689966 | -1.253249 | -0.266654 |
6667 rows × 5 columns
# Elbow method: fit k-means for k = 1..10 and plot the within-cluster sum
# of squares (inertia); the bend ("elbow") suggests a good cluster count.
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

wcss = [
    KMeans(n_clusters=k, init='k-means++', max_iter=300, n_init=10,
           random_state=0).fit(Loc_TA1).inertia_
    for k in range(1, 11)
]

plt.plot(range(1, 11), wcss)
plt.title('The elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')  # within-cluster sum of squares
plt.show()
from sklearn.cluster import KMeans

# Fit the final model and assign a cluster label to every row.
# The elbow plot hinted at 4 or 8 clusters; 6 is used here.
kmeans = KMeans(n_clusters=6).fit(Loc_TA1)
y_kmeans = kmeans.predict(Loc_TA1)
y_kmeans
array([4, 4, 5, ..., 2, 5, 2], dtype=int32)
# Attach the cluster labels (and a copy of the encoded gender column) to a
# new frame; Loc_TA1 itself is left untouched.
df3 = Loc_TA1.copy()
df3['clusterNo'] = y_kmeans
df3['namex'] = Loc_TA1['CustGender']
print(df3.head())
Age CustAccountBalance CustGender CustLocation TransactionAmount \ 0 0.483921 1.251709 0.591728 -0.661437 48.690581 1 -0.233442 -0.020657 0.591728 -0.410995 40.430206 2 0.919441 0.204128 0.591728 0.938898 12.345767 3 0.724886 -0.105064 0.591728 0.190689 10.930423 4 0.109872 -0.050429 0.591728 0.696250 10.448196 clusterNo namex 0 4 0.591728 1 4 0.591728 2 5 0.591728 3 5 0.591728 4 5 0.591728
Loc_TA1.head()
| Age | CustAccountBalance | CustGender | CustLocation | TransactionAmount | |
|---|---|---|---|---|---|
| 0 | 0.483921 | 1.251709 | 0.591728 | -0.661437 | 48.690581 |
| 1 | -0.233442 | -0.020657 | 0.591728 | -0.410995 | 40.430206 |
| 2 | 0.919441 | 0.204128 | 0.591728 | 0.938898 | 12.345767 |
| 3 | 0.724886 | -0.105064 | 0.591728 | 0.190689 | 10.930423 |
| 4 | 0.109872 | -0.050429 | 0.591728 | 0.696250 | 10.448196 |
# Alternative technique: hierarchical clustering shown as a dendrogram.
from scipy.cluster.hierarchy import dendrogram, linkage
from matplotlib import pyplot as plt

# Average-linkage merge tree over the standardised features.
linked = linkage(Loc_TA1, 'average')

plt.figure(figsize=(30, 30))
zx = dendrogram(
    linked,
    orientation='top',
    labels=Loc_TA.index,  # leaf labels are the row indices
    color_threshold=60,
    above_threshold_color='red',
)
plt.show()